Round 1: Top-level cell type annotation

Can we distinguish cell types by surface proteins?

.file <- "result/step2/prot_bbknn.rds"

if.needed(.file, {
    
    .batches <- .fread(.prot.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_svd(.prot.data$mtx,
                                    .batches$batch,
                                    knn = 10, RANK = 10,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
  • nTconv : CD3+, CD4+, CD8-, CD25-/CD127+, CD45RA+/CD45RO-
  • mTconv : CD3+, CD4+, CD8-, CD25-/CD127+, CD45RA-/CD45RO+
  • nTreg : CD3+, CD4+, CD8-, CD25+/CD127-, CD45RA+/CD45RO-
  • mTreg : CD3+, CD4+, CD8-, CD25+/CD127-, CD45RA-/CD45RO+

Run annotation purely based on marker genes:

.file <- "result/step2/prot_annot.txt.gz"

if.needed(.file, {
    .pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
    .neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")
    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            row_file = .prot.data$row,
            col_file = .prot.data$col,
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            EM_TOL = 1e-6,
            EM_ITER = 500)
    annot.dt <- setDT(.annot.out$annotation)
    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(annot.dt) <- .col
    annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
    fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)

Two-dimensional density plot on the normalized CD marker expressions.

U <- .bbknn$U
D <- .bbknn$D
V <- .bbknn$factors.adjusted

prot.mtx <- pmax(exp(sweep(U, 2, D, `*`) %*% t(V)) - 1, 0)
prot.mtx <- apply(prot.mtx, 2, function(x) x/pmax(sum(x),1) * 1e4)

rownames(prot.mtx) <- readLines(.prot.data$row)
colnames(prot.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker_bbknn.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

Two-dimensional density plot on the raw CD marker concentrations.

prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

Will the same classification results hold in transcriptomic data?

.file <- "result/step2/gene_bbknn.rds"

if.needed(.file, {
    .batches <- .fread(.gene.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]
    .bbknn <- rcpp_mmutil_bbknn_svd(.gene.data$mtx,
                                    .batches$batch,
                                    knn = 10, RANK = 30,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE,
                                    NUM_THREADS = 8)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
degree.cutoff <- function(G, .cutoff = 3) {
    G.sub <- G
    n.remove <- sum(igraph::degree(G.sub) < .cutoff)
    while(n.remove > 0){
        vv <- igraph::V(G.sub)
        .retain <- vv[igraph::degree(G.sub) >= .cutoff]
        G.sub <- igraph::induced_subgraph(G.sub, .retain)
        n.remove <- sum(igraph::degree(G.sub) < .cutoff)
    }
    return(G.sub)
}

comp.cutoff <- function(G, .cutoff = 10){
    .comp <- igraph::components(G)
    .kk <- which(.comp$csize >= .cutoff) # valid component(s)
    .valid <- which(.comp$membership %in% .kk) # valid nodes
    G <- igraph::induced_subgraph(G, igraph::V(G)[.valid])
    return(G)
}

run.bbknn.umap <- function(.bbknn, .cells, res=.3, nrepeat=20, ...){

    .dt <- data.table(from = c(.bbknn$knn$src.index,.bbknn$knn$tgt.index),
                      to = c(.bbknn$knn$tgt.index,.bbknn$knn$src.index),
                      weight = c(.bbknn$knn$weight, .bbknn$knn$weight))
    
    .df <-
        .dt[`from` != `to`,
            .(weight = mean(`weight`)),
            by = .(`from`, `to`)] %>%
        as.data.frame()

    G <- igraph::graph_from_data_frame(.df, directed=FALSE)

    C <- list(quality = 0)
    for(r in 1:nrepeat){
        c.r <- igraph::cluster_leiden(G, resolution_parameter = res, objective_function = "modularity")
        message(paste("quality: ", c.r$quality, "\n"))
        if(r == 1 || max(c.r$quality) > max(C$quality)){
            C <- c.r
        }
    }

    .clust <- data.table(tag = .cells[as.integer(C$names)],
                         membership = C$membership)

    G <- comp.cutoff(G, 10)
    G <- degree.cutoff(G, 3)
    A <- igraph::as_adj(G, attr="weight")

    umap.A <- uwot::optimize_graph_layout(A, verbose = TRUE,
                                          n_sgd_threads = "auto",
                                          ...)

    .dt <-
        data.table(tag = .cells[as.integer(rownames(A))],
                   umap1 = umap.A[,1],
                   umap2 = umap.A[,2]) %>% 
        (function(x) left_join(.clust, x, by = "tag")) %>%
        as.data.table()
    return(.dt)
}
  1. Build a batch-balancing kNN graph

  2. Leiden graph-based clustering

  3. Construct UMAP the kNN backbone graph

.file <- "result/step2/gene_bbknn_leiden.txt.gz"
.cells <- readLines(.gene.data$col)
if.needed(.file, {
    .bbknn.umap <- run.bbknn.umap(.bbknn, .cells,
                                  min_dist=0,
                                  spread=3)
    fwrite(.bbknn.umap, .file)
})

.bbknn.umap <- fread(.file) %>%
    left_join(annot.dt, by = "tag") %>% 
    mutate(membership = as.factor(membership)) %>%
    as.data.table()

## remove cells too far from the centre
## in order to show more cells
.bbknn.umap[, x := scale(umap1)]
.bbknn.umap[, y := scale(umap2)]
.bbknn.umap[, d := sqrt(x^2 + y^2)]

.size <- .bbknn.umap[, .(.N), by = .(membership)]

.bbknn.umap <-
    .size[N > 0, .(membership)] %>%
    left_join(.bbknn.umap, by = "membership") %>%
    filter(`d` < 2.2) %>%
    as.data.table()

Clustering patterns in scRNA-seq generally agrees with the classification results based on surface proteins

p1 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300)

p2 <-
    .gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_gene_only_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

However, gene expression patterns may not be sufficient.

.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)

plt <- 
    .gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
    xlab("Leiden clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22) +
    scale_size_continuous("#cells", range=c(0,5)) +
    scale_fill_distiller("proportion",
                         palette="PuRd", direction = 1,
                         labels=function(x) round(sigmoid(x), 2))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_gene_only_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

Joint clustering analysis

.file <- "result/step2/full_bbknn.rds"

if.needed(.file, {

    .batches <- .fread(.data$col, col.names="tag")
    .batches[, c("barcode","batch") := tstrsplit(tag, "_")]

    .bbknn <- rcpp_mmutil_bbknn_svd(.data$mtx,
                                    .batches$batch,
                                    knn = 10, RANK = 30,
                                    EM_ITER = 20,
                                    TAKE_LN = TRUE,
                                    NUM_THREADS = 8)
    saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)

1. Run annotation on the joint BBKNN data

.file <- "result/step2/prot_annot_full.txt.gz"
if.needed(.file, {
    .pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
    .neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")

    .annot.out <-
        rcpp_mmutil_annotate_columns(
            pos_labels = list(p1=.pos.markers),
            r_neg_labels = list(n1=.neg.markers),
            row_file = .data$row,
            col_file = .data$col,
            r_U = .bbknn$U,
            r_D = .bbknn$D,
            r_V = .bbknn$factors.adjusted,
            EM_TOL = 1e-6,
            EM_ITER = 500)

    annot.dt <- setDT(.annot.out$annotation)
    .col <- c("tag", "celltype", "prob", "ln.prob")
    names(annot.dt) <- .col
    annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
    fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)

Two-dimensional density plot on the raw CD marker concentrations.

prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)

.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)

.file <- fig.dir %&% "/Fig_round1_cdmarker_full.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)

PDF

2. Graph-based clustering

.file <- "result/step2/full_bbknn_leiden.txt.gz"
.cells <- readLines(.data$col)
if.needed(.file, {
    .bbknn.umap <- run.bbknn.umap(.bbknn, .cells,
                                  res=1.5,
                                  nrepeat=50,
                                  min_dist=1e-4,
                                  spread=3)
    fwrite(.bbknn.umap, .file)
})

.bbknn.umap <- fread(.file) %>%
    left_join(annot.dt, by = "tag") %>% 
    mutate(membership = as.factor(membership)) %>%
    as.data.table()

## remove cells too far from the centre
## in order to show more cells
.bbknn.umap[, x := scale(umap1)]
.bbknn.umap[, y := scale(umap2)]
.bbknn.umap[, d := sqrt(x^2 + y^2)]

.size <- .bbknn.umap[, .(.N), by = .(membership)]

.bbknn.umap <-
    .size[N > 0, .(membership)] %>%
    left_join(.bbknn.umap, by = "membership") %>%
    filter(`d` < 2.2) %>%
    as.data.table()
p1 <-
    .gg.plot(.bbknn.umap[sample(.N)], aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300)

p2 <-
    .gg.plot(.bbknn.umap[sample(.N)], aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer(palette = "Paired")

plt <- p1 | p2
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_full_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)

PDF

.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)

plt <-
    .gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
    xlab("Leiden clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22) +
    scale_size_continuous("#cells", range=c(0,5)) +
    scale_fill_distiller("proportion",
                         palette="PuRd", direction = 1,
                         labels=function(x) round(sigmoid(x), 2))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_full_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

  • Annotate each cell cluster with the maximally matching cell type
## a. Assign cell types to clusters
.size <- .dt[, .(.N), by = .(membership, celltype)]
.argmax <- .size[order(`N`), tail(.SD, 1), by = .(membership)]
  • 18 modules

  • Remove clusters with too few cells (N \(<\) 500) and retain 13 clusters

.argmax[, `N` := NULL]
celltype.final <-
    fread(.file) %>%
    left_join(.argmax) %>%
    left_join(.size) %>%
    as.data.table() %>%
    na.omit()
## b. Remove small clusters
celltype.final <- celltype.final[`N` > 500]
  • 23,925 cells.

Check with Liang’s semi-supervised approach.

.overlap <- 
    fread("data/PRDM1/PRDM1_SL.csv.gz") %>%
    left_join(celltype.final) %>%
    na.omit()

.dt <- .overlap[, .(.N), by = .(cell_type_liang, celltype)]
.dt <- .dt[, lab := num.int(`N`)]

.aes <- aes(celltype, cell_type_liang, fill=`N`, label=`lab`)
plt <- .gg.plot(.dt, .aes) +
    geom_tile(linewidth=.1, colour="black") +
    geom_text(size=3) +
    scale_fill_distiller("N", direction=1) +
    theme(axis.text.x = element_text(angle=90, vjust=1, hjust=1)) +
    xlab("CD annotation + Leiden") +
    ylab("Classifier (Liang)")
print(plt)

## remove cells too far from the centre
## in order to show more cells
.temp <- fread("data/PRDM1/PRDM1_SL.csv.gz")
.bbknn.umap <- celltype.final %>% left_join(.temp) %>% as.data.table()

.bbknn.umap[, membership := as.factor(`membership`)]
.bbknn.umap[, x := scale(umap1)]
.bbknn.umap[, y := scale(umap2)]
.bbknn.umap[, d := sqrt(x^2 + y^2)]
.bbknn.umap <- .bbknn.umap[`d` < 2.2 & `N` > 500]

p1 <-
    .gg.plot(.bbknn.umap[sample(.N)], aes(umap1, umap2, color=membership)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300)

p2 <-
    .gg.plot(.bbknn.umap[sample(.N)], aes(umap1, umap2, color=celltype)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer("annotation\n+Leiden", palette = "Paired")

p3 <-
    .gg.plot(.bbknn.umap[sample(.N)], aes(umap1, umap2, color=cell_type_liang)) +
    ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.7), dpi=300) +
    scale_color_brewer("supervised", palette = "Paired")

plt <- wrap_plots(p1, p2, p3, nrow=1)
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_refined_umap.pdf"
.gg.save(filename = .file, plot = plt, width=12, height=3)

PDF

DOWNLOAD: Cell type estimation

.ct.mem <- celltype.final[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]

.df <- 
    .ct.mem %>%
    mutate(row = celltype, col = membership,
           weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>% 
    order.pair(ret.tab = TRUE)

plt <-
    .gg.plot(.df, aes(col, row, size = `nc`)) +
    xlab("Leiden clustering") +
    ylab("cell type") +
    theme(legend.position = "bottom") +
    geom_point(pch=22, fill="black") +
    scale_size_continuous("#cells", range=c(0,5))
print(plt)

.file <- fig.dir %&% "/Fig_bbknn_refined_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)

PDF

3. Confirm by other marker genes/proteins

.markers <-
    c("FOXP3", "ID3", "BACH2", "CXCR3", "PRDM1", "SGK1", "TCF7", "LEF1",
      "SELL", "IL2RA", "IL7R", "IKZF2", "CCR6", "CCR4", "CCR7", "CTLA4",
      "HLA-DRA", "anti_CD25", "anti_CD127", "anti_CD183", "anti_CD196",
      "anti_CD197", "anti_CD194", "anti_CD45RA", "anti_CD45RO",
      "anti_HLA") %>%
    unique
.marker.hdr <- "result/step2/marker/marker"
.mkdir(dirname(.marker.hdr))
.marker.data <- fileset.list(.marker.hdr)
if.needed(.marker.data, {
    .marker.data <-
        rcpp_mmutil_copy_selected_rows(.data$mtx,
                                       .data$row,
                                       .data$col,
                                       .markers,
                                       .marker.hdr)
})

Note: these plots show expression values without batch adjustment

.markers <- sort(as.character(unique(marker.dt$marker)))
for(g in .markers){
    plt <- plot.marker.umap(g)
    print(plt)
    .file <- fig.dir %&% "/Fig_marker_" %&% g %&% "_umap.pdf"
    .gg.save(filename = .file, plot = plt, width=3.2, height=2.8)
}

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

PDF

Basic statistics for the first round annotation (23,925 cells)

.hash.info <- fread("data/cell_info_batch3-7.csv.gz") %>%
    rename(tag = cbid, subject = id) %>%
    select(-`V1`) %>%
    mutate(disease = substr(`subject`, 1, 2)) %>% 
    mutate(hash = gsub(pattern="[a-zA-Z\\-]+", replacement="", `hash`))

.sample.info <-
    readxl::read_xlsx("data/Hashing list MS Treg project.xlsx", 1) %>% 
    rename(Sample = `Cell type`) %>%
    mutate(hash = gsub("#","",`hash`))

.dt <-
    celltype.final %>%
    mutate(batch=as.integer(batch)) %>%
    left_join(.hash.info) %>%
    mutate(ct=factor(celltype, c("nTconv","mTconv","nTreg","mTreg"))) %>% 
    left_join(.sample.info) %>%
    na.omit()

Combining all the experiments

PDF

PDF

Total CD4 samples

PDF

PDF

CD25 enriched samples

PDF

PDF